* Weights wic
* 26 Sept 2015 Federica
* Update 24 Feb 2016

* For each applicant, calculate the number of patents filed in different countries
* For pre-period < 1985

*cd "C:\Users\federc\Dropbox\PATSTAT\DATA"
*global tmp "D:\TEMP"
*global dta "D:\Patstat data"

* Active machine configuration (alternative settings are kept commented out above).
* tmp: scratch directory for intermediate files
global tmp "/tmp"
* dta: root directory of the raw input tables (tls201, tls207, tls229, tls906)
global dta "/users/andreas/dropbox/work/patstat_data"
* Working directory: files saved or loaded without a path live here
cd "/users/andreas/dropbox/work/patstat/data"


* ----------------------------------------------
* Preliminaries
* (note: when several countries have the same number of entries, the minmode option below picks the lowest of the tied codes)
* ----------------------------------------------
* Load tls906_all and keep only the identifiers used downstream.
* Paths are quoted so that a $dta containing blanks (e.g. "D:\Patstat data") still resolves.
use "$dta/tls906/tls906_all", clear
keep person_id person_ctry_code hrm_l2_id han_id
* Identify headquarter as the most frequent country of residence within hrm_l2_id.
* Surrounding blanks are removed first so padded codes count as the same country.
replace person_ctry_code = trim(person_ctry_code)
* minmode: if several countries are equally frequent, the lowest mode is used.
egen headq = mode(person_ctry_code), by(hrm_l2_id) minmode
save "$tmp/906", replace

* How frequent is it to find multiple country codes for same firm?
* Diagnostic only: nothing is saved from this step.
use "$tmp/906", clear
* Flag records whose non-empty country code is not the firm's headquarter
gen off_hq = (person_ctry_code != headq) & (person_ctry_code != "")
* Reduce to one row per firm; the flag is 1 if any of its records deviated
collapse (max) off_hq, by(hrm_l2_id)
* Number of firms
count
* Number of firms with at least one deviating country code
count if off_hq == 1


* ----------------------------------------------
* Find relevant patents
* ----------------------------------------------
* Load tls201 (path quoted so that a $dta containing blanks still resolves).
use "$dta/tls201/tls201", clear
* Filing year = first four characters of the filing date string, converted to numeric
gen y = substr(appln_filing_date, 1, 4)
destring y, replace
* Lower bound only; the upper bound is applied later when the weights are built
*drop if y<1965 | y>1985
drop if y < 1965
keep appln_id appln_auth y
save "$tmp/patents", replace

* ----------------------------------------------
* Find all applicants of relevant patents
* (note: potentially many applicants per patent)
* ----------------------------------------------
* Load tls207 (path quoted so that a $dta containing blanks still resolves).
use "$dta/tls207/tls207", clear
* Keep rows with a positive applicant sequence number
* (presumably 0 marks persons who are not applicants -- confirm against the PATSTAT documentation)
keep if applt_seq_nr > 0
* Restrict to the relevant patents and pick up appln_auth and y
merge m:1 appln_id using "$tmp/patents", keep(match) nogenerate

* Find harmonized name (hrm_l2_id) and headquarter country for each person
merge m:1 person_id using "$tmp/906", keep(match) nogenerate
*save pat_hldr_preperiod, replace
save pat_hldr_allperiod, replace


* ----------------------------------------------
* Find Nace2 code for each applicant
* The higher the weight, the stronger the relationship between an application and an industry
* The total of all weights of one application always equals 1
* ----------------------------------------------
use "$dta/tls229/tls229", clear
* Truncate the code to one decimal place (the "3 digit" level)
gen nace2 = int(nace2_code*10)/10
* Pair every NACE row of an application with every applicant row of that application
*joinby appln_id using pat_hldr_preperiod, unmatched(using)
joinby appln_id using pat_hldr_allperiod, unmatched(using) // slow
// 1.96 % not matched from pat_hldr_preperiod
* Discard applicant rows that found no NACE record
drop if _merge != 3
drop _merge
keep hrm_l2_id headq appln_id nace2 weight
save "$tmp/matched", replace

* Find most frequent nace code for each firm
* Output: one row per hrm_l2_id with nace2_1 = the code carrying the largest summed weight.
use "$tmp/matched", clear
collapse (sum) weight, by(hrm_l2_id nace2)
* Order each firm's codes by descending weight; ties are broken by the lowest nace2
* so the chosen code is reproducible across runs (a unique rank would break ties arbitrarily).
gen double neg_weight = -weight
bysort hrm_l2_id (neg_weight nace2): keep if _n == 1
keep hrm_l2_id nace2
rename nace2 nace2_1
save "$tmp/tmpnace", replace

* Merge back to patent data: attach each firm's top nace code to its applicant rows
use pat_hldr_allperiod, clear
*use pat_hldr_preperiod, clear
* Firms without a nace code are kept (nace2_1 stays missing for them)
merge m:1 hrm_l2_id using "$tmp/tmpnace", keep(master match) nogenerate
label variable nace2_1 "Most frequent nace2"
*label var nace2_2 "Second most frequent nace2"
*label var nace2_3 "Third most frequent nace2"
*save pat_hldr_preperiod_2, replace
save pat_hldr_allperiod_2, replace

* Country weights per firm: share of a firm's patents filed at each appln_auth.
* Read the file written by the merge-back step above. pat_hldr_preperiod_2 is not
* produced by the script as currently configured, so loading it would use a stale
* file (or fail). With the year cut-off below, both files select the same years.
use pat_hldr_allperiod_2, clear
*use pat_hldr_preperiod_2, clear

keep if y<=1975
*keep if y<=1985

* Drop duplicate appln_id for identical combinations of hrm_l2_id headq appln_auth,
* so each application is counted once per firm and filing authority
bysort hrm_l2_id headq appln_auth appln_id: keep if _n == 1
* p = number of applications per firm and filing authority
collapse (count) p=appln_id, by(hrm_l2_id headq appln_auth nace2_1)
* tot = number of applications per firm over all filing authorities
egen tot = total(p), by(hrm_l2_id)
gen w = p/tot
label var w "Weight for country"
label var p "Number of patents in appln_auth"
label var tot "Total number of patents for firm"
*save weightswic5_to1985, replace
save weightswic5_to1975, replace
*rm pat_hldr_allperiod


* -------------------------------------------------
* Construct instrumented weights I (aggregate weights) CURRENTLY USED
* -------------------------------------------------

* nn: extra grouping variable for the aggregate weights ("" = by headquarter only)
global nn "nace"
*global nn ""

* Intermediate files go to $tmp (the scratch directory set at the top of the file)
* instead of a hard-coded /tmp, so the alternative machine configuration works too.

* Which countries to calculate weights for
use tariffreshape, clear
contract appln
*contract appln nace
save "$tmp/cty", replace

* Calculate instrumented weights
* NOTE(review): as configured, the weights step of this script saves weightswic5_to1975;
* weightswic5_to1985 has to come from a run with the 1985 cut-off -- confirm it is up to date.
use weightswic5_to1985,clear
keep if headq!=""
* Pool patent counts over all firms with the same headquarter (and $nn)
collapse (sum) p, by(headq appln $nn)
* Keep only the countries selected above
merge m:1 appln using "$tmp/cty", keep(match)
*merge m:1 appln nace using "$tmp/cty", keep(match)
drop _merge

* Share of each filing country within headquarter (and $nn)
egen tot = total(p), by(headq $nn)
gen w = p/tot
drop p tot
save "$tmp/what1", replace

* Merge firm IDs with weights
use weightswic5_to1985,clear
* One row per firm
contract hrm headq nace
drop _freq
merge m:1 hrm using P85, keep(match)  // Only use firms that are relevant in the regressions
drop _merge
* Every firm receives the full set of country weights of its headquarter (and $nn) group
joinby headq $nn using "$tmp/what1"
save weightswic5_to1985_predicted, replace

* -------------------------------------------------
* Construct instrumented weights II (gravity) - NOT USED 
* -------------------------------------------------

* Which countries to calculate weights for
* Intermediate files go to $tmp (the scratch directory set at the top of the file)
* instead of a hard-coded /tmp, so the alternative machine configuration works too.

* Which countries to calculate weights for
* NOTE(review): as configured, the weights step of this script saves weightswic5_to1975;
* weightswic5_to1985 has to come from a run with the 1985 cut-off -- confirm it is up to date.
use weightswic5_to1985,clear
keep if headq!=""
merge m:1 hrm using P85, keep(match)  // Only use firms that are relevant in the regressions
drop _merge
merge m:1 appln nace using tariffreshape, keep(match)
keep hrm headq appln nace p
save "$tmp/firms", replace
* Patent counts per headquarter country and filing country
collapse (sum) p, by(headq appln)
save "$tmp/pgrav", replace
* List of filing countries that appear in the data
contract appln
save "$tmp/cty", replace

* Square gravity data: keep country pairs where both origin and destination are in the list
use cepii_gravity/dist_cepii_iso2, clear
keep iso* dist contig
* Filter origins by temporarily naming them appln_auth
ren iso2_o appln_auth
merge m:1 appln using "$tmp/cty", keep(match)
drop _merge
* Restore the origin name and filter destinations the same way
ren (appln iso2_d) (iso2_o appln_auth)
merge m:1 appln using "$tmp/cty", keep(match)
drop _merge
ren appln iso2_d

* Add patents: origin = headquarter country, destination = filing country
ren (iso2_o iso2_d) (headq appln_auth)
merge 1:1 headq appln using "$tmp/pgrav"
* Country pairs without any patent get a zero count
replace p=0 if _merge==1
* Pairs with patents but no distance data cannot be used
drop if _merge==2
drop _merge

* Poisson regression of patent counts on log distance, a same-country dummy and
* filing-country / headquarter-country fixed effects
gen lndist = log(dist)
gen same = appln==headq
encode headq, gen(s)
encode appln, gen(d)
poisson p lndist same i.d i.s
* Predicted counts from the linear index
predict lnphat, xb
gen phat = exp(lnphat)
* Predicted share of each filing country within headquarter country
egen tothat = total(phat), by(headq)
gen w = phat/tothat
keep headq appln w
save "$tmp/what1", replace

* Merge firm IDs with weights
use "$tmp/firms", clear
* One row per firm
contract hrm headq nace
drop _freq
* Every firm receives the predicted country weights of its headquarter country
joinby headq using "$tmp/what1"
save weightswic5_to1985_predicted2, replace
